#  We will use this to explore the distribution
#  of sample means.

#  First, let us create a large population
source("../gnrnd5.R")  # script expected to define gnrnd5()
gnrnd5(156437499904, 413003074)
#
#  Record the mean and standard deviation of this
#  population so later results can be compared to them.
pop_mean <- mean(L1)
pop_mean
source("../pop_sd.R")  # script expected to define pop_sd()
pop_sigma <- pop_sd(L1)
pop_sigma

#  now, get 10000 samples of that population
#  and, for each sample, save the sample mean

#  We will get samples of size 36
samp_size <- 36

L2 <- rep(NA_real_, 10000)  # NA until filled, so an unfinished run is obvious
for (i in seq_along(L2)) {
  L3 <- sample(L1, size = samp_size)  # one random sample from the population
  samp_mean <- mean(L3)
  L2[i] <- samp_mean  # keep this sample's mean
}

#  we maintain that the distribution of sample
#  means for samples of this size will be 
#  normal, with the same mean as the population
#  and the distribution of the sample means
#  will have a standard deviation equal to
#  the population standard deviation divided by 
#  the square root of the sample size, i.e.,
#  N( pop_mean, pop_sigma/sqrt(samp_size) )

#  let us see what we have
#  The mean of the means is
mean(L2)
#   ...compare that with
pop_mean  # the population mean

#  Standard deviation of the sample means (via pop_sd):
pop_sd(L2)
#   ...compare that with the predicted value
pop_sigma / sqrt(samp_size)
#   With center and spread checked, the remaining
#   question is the shape of the distribution:
#   do the sample means look normal?
hist(L2)
hist(L2, breaks = 30,
     main = "Distribution of Sample Means")

boxplot(L2, horizontal = TRUE)
source("../assess_normality.R")
assess_normality(L2)

###############################
#   That was great, but now go back and 
#   try the same thing for samples of 
#   size 29  (just change line 20 and then
#   run the subsequent lines through line 57 )
####################################
############################################################

#  I wonder if this had anything to do with 
#  the distribution of the original population
hist(L1)                        # histogram of the population itself
boxplot(L1, horizontal = TRUE)  # box plot, drawn horizontally
assess_normality(L1)
#   Clearly the original population was 
#   normal.  Let us try all of the same work
#   but this time for a population that is 
#   definitely not normal

gnrnd5(156437499901, 1418002352)
#
#   Inspect the new population first to confirm
#   that it is not normal.
hist(L1)
boxplot(L1, horizontal = TRUE)
assess_normality(L1)
#   Clearly not normal.
#   Now record the mean and standard deviation of this population.

pop_mean <- mean(L1)
pop_mean

pop_sigma <- pop_sd(L1)
pop_sigma

#  now, get 10000 samples of that population
#  and, for each sample, save the sample mean

#  We will get samples of size 36
samp_size <- 36

L2 <- rep(NA_real_, 10000)  # NA until filled, so an unfinished run is obvious
for (i in seq_along(L2)) {
  L3 <- sample(L1, size = samp_size)  # one random sample from the population
  samp_mean <- mean(L3)
  L2[i] <- samp_mean  # keep this sample's mean
}

#  we maintain that the distribution of sample
#  means for samples of this size will be 
#  normal, with the same mean as the population
#  and the distribution of the sample means
#  will have a standard deviation equal to
#  the population standard deviation divided by 
#  the square root of the sample size, i.e.,
#  N( pop_mean, pop_sigma/sqrt(samp_size) )

#  let us see what we have
#  The mean of the means is
mean(L2)
#   ...compare that with
pop_mean  # the population mean

#  Standard deviation of the sample means (via pop_sd):
pop_sd(L2)
#   ...compare that with the predicted value
pop_sigma / sqrt(samp_size)
#   With center and spread checked, the remaining
#   question is the shape of the distribution:
#   do the sample means look normal?
hist(L2)
hist(L2, breaks = 30,
     main = "Distribution of Sample Means")

boxplot(L2, horizontal = TRUE)

assess_normality(L2)
#  so the distribution of the sample means
#  is normal with the same expected values for
#  the mean of the mean values and the 
#  standard deviation of the mean values.


###################################
#  You could go back to line 97 and change 
#  the size of the sample to see that this
#  does not depend on the sample size (within
#  reason...you want to stay away from really
#  small samples...if the population is not
#  approximately normal, as in this case, then
#  you want to have samples of 30 or more as a 
#  general rule).
#####################################

####  Let us try a different population
####   Again, choose one that is not normal


gnrnd5(156437499902, 1418002352)
#
#   Inspect the new population first to confirm
#   that it is not normal.
hist(L1)
boxplot(L1, horizontal = TRUE)
assess_normality(L1)
#   Clearly not normal.
#   Now record the mean and standard deviation of this population.

pop_mean <- mean(L1)
pop_mean

pop_sigma <- pop_sd(L1)
pop_sigma

#  now, get 10000 samples of that population
#  and, for each sample, save the sample mean

#  We will get samples of size 36
samp_size <- 36

L2 <- rep(NA_real_, 10000)  # NA until filled, so an unfinished run is obvious
for (i in seq_along(L2)) {
  L3 <- sample(L1, size = samp_size)  # one random sample from the population
  samp_mean <- mean(L3)
  L2[i] <- samp_mean  # keep this sample's mean
}

#  we maintain that the distribution of sample
#  means for samples of this size will be 
#  normal, with the same mean as the population
#  and the distribution of the sample means
#  will have a standard deviation equal to
#  the population standard deviation divided by 
#  the square root of the sample size, i.e.,
#  N( pop_mean, pop_sigma/sqrt(samp_size) )

#  let us see what we have
#  The mean of the means is
mean(L2)
#   ...compare that with
pop_mean  # the population mean

#  Standard deviation of the sample means (via pop_sd):
pop_sd(L2)
#   ...compare that with the predicted value
pop_sigma / sqrt(samp_size)
#   With center and spread checked, the remaining
#   question is the shape of the distribution:
#   do the sample means look normal?
hist(L2)
hist(L2, breaks = 30,
     main = "Distribution of Sample Means")

boxplot(L2, horizontal = TRUE)

assess_normality(L2)
#  so the distribution of the sample means
#  is normal with the same expected values for
#  the mean of the mean values and the 
#  standard deviation of the mean values.


###################################
#  You could go back to line 176 and change 
#  the size of the sample to see that this
#  does not depend on the sample size (again, 
#  within reason).